ANALYSIS OF CUSTOMER LOAN REPAYMENT HISTORY AND
PREDICTING CREDIT RISK SCORE & CUSTOMER SEGMENTATION
Agenda
In this hackathon, I am going to build a machine learning model on a financial dataset to predict the CreditRiskScore of home loan borrowers. Many housing finance companies use this score to determine the quantum of additional finance that can be allowed against the mortgage. Credit risk scores are used to evaluate the potential risk posed by lending money to consumers, so lenders can take decisions that mitigate losses due to bad debt.
We have two datasets that provide information about the demographics and the financial payment history of the borrowers.
Objectives
To do exploratory data analysis using visualizations.
To build an analytical framework to predict the credit risk score of each customer using the payment history and other demographic features.
To segment the customers, which will help us understand their sensitivity to interest rates or product features so we can improve retention (i.e., reduce defaults or the transfer of loans to competitors) and understand how to structure offers for better revenue.
Importing The Necessary Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')
import os
from imblearn.datasets import fetch_datasets
from kmeans_smote import KMeansSMOTE
from collections import Counter
from imblearn.datasets import fetch_datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from imblearn.over_sampling import SMOTE
from math import *
import pandas as pd
import numpy as np
import tensorflow as tf
import warnings
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import Imputer, StandardScaler
import matplotlib.pyplot as plt
from matplotlib import gridspec
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from keras.optimizers import Adam
from keras import backend as K
from tensorflow.python.client import device_lib
import lightgbm as lgb
import catboost as cb
import xgboost as xgb
import seaborn as sns
sns.set_style("whitegrid")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
warnings.filterwarnings('ignore')
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import preprocessing, model_selection, metrics, feature_selection
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn import neighbors, linear_model, svm, tree, ensemble
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from IPython.display import display, HTML
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode,iplot
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')
import os
from imblearn.datasets import fetch_datasets
from kmeans_smote import KMeansSMOTE
from collections import Counter
from imblearn.datasets import fetch_datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from imblearn.over_sampling import SMOTE
from math import *
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
import warnings
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import Imputer, StandardScaler
import matplotlib.pyplot as plt
from matplotlib import gridspec
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from keras.optimizers import Adam
from keras import backend as K
from tensorflow.python.client import device_lib
import lightgbm as lgb
import catboost as cb
import xgboost as xgb
import seaborn as sns
sns.set_style("whitegrid")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
warnings.filterwarnings('ignore')
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import preprocessing, model_selection, metrics, feature_selection
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn import neighbors, linear_model, svm, tree, ensemble
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from IPython.display import display, HTML
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode,iplot
import warnings
def ignore_warn(*args, **kwargs):
    """No-op stand-in for warnings.warn — accepts and discards any arguments."""
    return None

# Silence every warning routed through warnings.warn (sklearn, seaborn, ...).
warnings.warn = ignore_warn
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew
from sklearn import preprocessing
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNetCV, ElasticNet
from xgboost import XGBRegressor, plot_importance
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x))
Set Path
# Work from the project data directory.  The raw string prevents any
# backslash in the Windows path from being read as an escape sequence,
# and print(os.getcwd()) replaces pwd(), which is an IPython magic and
# a NameError in plain Python.
os.chdir(r"F:\insofe\PHD\data\ml")
print(os.getcwd())
Creating a Pandas DataFrame from a CSV file
# Raw training data 1: borrower demographics (presumably one row per customer — confirmed later by drop_duplicates usage).
demographic_raw_dt=pd.read_excel("Train-1557019772293.xlsx")
# Raw training data 2: instalment-level payment history (multiple rows per customer).
payment_raw_dt=pd.read_excel("Train_PaymentHistory-1557019802161.xlsx")
# Test-set demographics, same schema as the training demographics file.
test_demo_dt=pd.read_excel("Test-1557294637283.xlsx")
# Test-set payment history, same schema as the training payment file.
test_payment_dt=pd.read_excel("Test_PaymentHistory-1557294848030.xlsx")
# Sample submission file showing the expected output format.
sample_file=pd.read_csv('samplesubmission-1557545918238.csv')
print("demographic dataset has {} samples with {} features each.".format(*demographic_raw_dt.shape))
print ("payment dataset has {} samples with {} features each.".format(*payment_raw_dt.shape))
demographic_raw_dt.head()
demographic_raw_dt.columns
demographic_raw_dt.dtypes
payment_raw_dt.head(30)
payment_raw_dt.dtypes
Columns in the dataset
CustomerID - unique no
Current_Instalment_Sequence -
Starting_Instalment
Maturity_Period - the time between when the loan is issued and when it matures, in months.
Current_Outstanding
Current_Loan_to_Appraisedvalu_Percent
CurrentInterestrate
RealEstate_Current_Inflation - property rate in percentage
GDP -is defined as the market value of the goods and services produced by a country.
UnemploymentRate
Asset_type
Urban_Development
Villa_House
Investment_SelfOccupied-'Self Occupancy', 'Investment'
Starting_outstanding - total debt
Starting_Loan_to_Appraisedvalu_Percent -LTV ratio is 75% or lower, you could get a lower rate, because the loan is seen as less risky to the lender. If the value of the home increases after you close on your home purchase, you may be able to refinance to a lower interest rate.
StartingInterestrate
RealEstate_Starting_Inflation
Payment_Status -'Non-Payoff/Non-Default', 'Default', 'Payoff'
Salary
ProfessionalLicensure
UtilitySpending
eCommerceAccount
SocialMediaAccount
DOB
NoOfProperties- total properties of borrowers.
CreditRiskScore - measures the creditworthiness of borrowers, scored between 300 and 900; a score above 650 is needed to be eligible for a loan.
Convert DOB into Age
# Parse DOB into a proper datetime column.
demographic_raw_dt['DOB']=demographic_raw_dt['DOB'].astype('datetime64')
# Derive age in whole years from today's date.
# NOTE(review): casting a timedelta to '<m8[Y]' is deprecated/removed in
# pandas >= 2.0; `.dt.days // 365.25` is the forward-compatible spelling — confirm before upgrading.
demographic_raw_dt['age'] = (pd.to_datetime('now') - demographic_raw_dt['DOB']).astype('<m8[Y]')
demographic_raw_dt['age'] = demographic_raw_dt['age'].astype('int')
Remove DOB
demographic_raw_dt=demographic_raw_dt.drop(axis=1,columns='DOB')
merge_dt=payment_raw_dt.merge(demographic_raw_dt, left_on='CustomerID', right_on='CustomerID')
merge_dt.columns
merge_dt.shape
merge_dt.head(5)
merge_dt.info()
merge_dt.describe(include='all').T.sort_values("count")
print(merge_dt.ProfessionalLicensure.unique())
print(merge_dt.eCommerceAccount.unique())
print(merge_dt.NoOfProperties.unique())
merge_dt['ProfessionalLicensure']=merge_dt['ProfessionalLicensure'].astype('object')
merge_dt['eCommerceAccount']=merge_dt['eCommerceAccount'].astype('object')
merge_dt['NoOfProperties']=merge_dt['NoOfProperties'].astype('object')
merge_dt.Urban_Development.unique()
merge_dt.describe(include=['O'])
merge_dt.describe(include=['float32','float64','int64','int32'])
Correlation Plot
import matplotlib.pyplot as plt
def data_corr(data):
    """Render an annotated heatmap of the pairwise correlations in *data*."""
    fig, ax = plt.subplots(figsize=(15, 15))
    corr_matrix = data.corr()
    sns.heatmap(corr_matrix, annot=True, cbar=True, cmap="RdYlGn")
num = list(merge_dt.select_dtypes(include=['float32','float64','int64','int32']).columns)
data_corr(merge_dt[num])
Check Missing Data
#function to find missing values
def miss_data(x):
    """Bar-plot and print the per-column missing-value count and fraction of *x*."""
    null_counts = x.isnull().sum()
    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / x.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    missing_data.index.name = 'column_names'
    missing_data.reset_index(inplace=True)
    sns.set(style="whitegrid")
    sns.set(rc={'figure.figsize': (20, 8.27)})
    sns.barplot(missing_data.column_names, missing_data.Percent, alpha=0.9)
    print(plt.title('missing data plot'))
    print(plt.ylabel('percentage of missing data', fontsize=12))
    print(plt.xlabel('column names', fontsize=12))
    print(plt.show())
    print(missing_data)
miss_data(x=merge_dt)
Remove Duplicates
print('Entrées dupliquées: {}'.format(merge_dt.duplicated().sum()))
merge_dt.drop_duplicates(inplace = True)
merge_dt['Appraisal_value']=merge_dt['Starting_outstanding']/(merge_dt['Starting_Loan_to_Appraisedvalu_Percent']/100)
merge_dt['current_Appraisal_value']=merge_dt['Current_Outstanding']/(merge_dt['Current_Loan_to_Appraisedvalu_Percent']/100)
merge_dt['remaining_outstanding']= merge_dt['Starting_outstanding'] - merge_dt['Current_Outstanding']
new_data=merge_dt[['CustomerID', 'Starting_Instalment','Maturity_Period','Asset_type',
'Urban_Development', 'Villa_House', 'Investment_SelfOccupied',
'Starting_outstanding', 'Starting_Loan_to_Appraisedvalu_Percent',
'StartingInterestrate', 'RealEstate_Starting_Inflation',
'age', 'Salary', 'ProfessionalLicensure','Payment_Status',
'UtilitySpending', 'eCommerceAccount', 'SocialMediaAccount','Appraisal_value',
'NoOfProperties', 'CreditRiskScore']]
new_data=new_data.drop_duplicates()
Create Credit Class
# Bucket CreditRiskScore into three ordinal classes.
# Fix: a score of exactly 650 previously fell through to "low_score"
# (it satisfied neither > 650 nor < 650); it is now Medium_score.
# The lower boundary is unchanged: exactly 450 is still low_score.
credit_rate = []
for i in range(len(new_data.CreditRiskScore)):
    score = new_data.CreditRiskScore.iloc[i]
    if score > 650:
        credit_rate.append("high_score")
    elif score > 450:  # (450, 650] — the > 650 case was handled above
        credit_rate.append("Medium_score")
    else:
        credit_rate.append("low_score")
new_data['credit_class'] = credit_rate
Univariate Analysis
def histplot(df):
    """Plot a distribution (histogram + KDE) for every numeric column of *df*.

    Bug fix: the original plotted the global ``new_data[i]`` instead of the
    frame that was passed in, so calling it on any other DataFrame silently
    showed the wrong data.  The unused ``carrier_count`` local is removed.
    """
    num_cols = list(df.select_dtypes(include=['float32', 'float64', 'int64', 'int32']).columns)
    for col in num_cols:
        sns.set(style="darkgrid")
        # NOTE(review): distplot is deprecated in seaborn >= 0.11 (displot/histplot succeed it).
        sns.distplot(df[col])
        plt.title('Frequency Distribution of' + col)
        plt.ylabel('Number of Occurrences', fontsize=12)
        plt.xlabel(col, fontsize=12)
        print(plt.show())
histplot(new_data)
sns.distplot(merge_dt['Current_Loan_to_Appraisedvalu_Percent'])
sns.distplot(merge_dt['remaining_outstanding'])
sns.distplot(merge_dt['UnemploymentRate'])
sns.distplot(merge_dt['GDP'])
sns.distplot(merge_dt['CurrentInterestrate'])
sns.distplot(merge_dt['Current_Outstanding'])
sns.distplot(merge_dt['Starting_Instalment'])
sns.distplot(merge_dt['Starting_Instalment'])
sns.distplot(merge_dt['Current_Instalment_Sequence'])
def distributionplot(df):
    """Bar-chart the value counts of every categorical (object) column of *df*."""
    for col in df.select_dtypes(include=['object']).columns:
        counts = df[col].value_counts()
        sns.set(style="darkgrid")
        sns.barplot(counts.index, counts.values, alpha=0.9)
        plt.title('Frequency Distribution of' + col)
        plt.ylabel('Number of Occurrences', fontsize=12)
        plt.xlabel(col, fontsize=12)
        print(plt.show())
merge_data=merge_dt
merge_data=merge_data.drop(axis=1,columns='CustomerID')
distributionplot(merge_data)
Bivariate Analysis
# Standard plotly imports
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
# Using plotly + cufflinks in offline mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
demographic_raw_dt.pivot(columns='age', values='CreditRiskScore').iplot(
kind='box',xTitle='age',
yTitle='CreditRiskScore',
title='CreditRiskScore Distribution by age ')
color_seq = []
for i in new_data.SocialMediaAccount:
if i =='Yes':
color_seq.append('red')
else:
color_seq.append('green')
plt.figure(figsize = (10,7))
plt.scatter(x = new_data.Salary,y = new_data.Maturity_Period,color = color_seq)
plt.title('Maturity_Period vs Salary')
plt.xlabel('Salary')
plt.ylabel('Maturity_Period')
plt.show()
new_data.iplot(
x='Maturity_Period',
y='age',
# Specify the category
categories='credit_class',
xTitle='Maturity_Period',
yTitle='age',
title='Maturity_Period vs age by credit_class')
new_data.iplot(
x='Starting_Instalment',
y='StartingInterestrate',
# Specify the category
categories='credit_class',
xTitle='Starting_Instalment',
yTitle='StartingInterestrate',
title='StartingInterestrate vs Starting_Instalment by credit_class ')
Univariate Analysis[OUTLIERS]
def boxplot(df):
    """Draw a boxplot for every numeric column of *df* for outlier inspection.

    Fix: removed the unused ``carrier_count = df[i].value_counts()`` local,
    which did an O(n) pass per column for nothing.
    """
    num_cols = list(df.select_dtypes(include=['float32', 'float64', 'int64', 'int32']).columns)
    for col in num_cols:
        sns.set(style="darkgrid")
        sns.boxplot(df[col])
        plt.title('Frequency Distribution of' + col)
        plt.ylabel('Number of Occurrences', fontsize=12)
        plt.xlabel(col, fontsize=12)
        print(plt.show())
#'float32','float64','int64','int32'
boxplot(merge_dt)
#sns.boxplot(x=new_data['Starting_Instalment'])
Remove Outliers
merge_dt.loc[(merge_dt["Current_Outstanding"] >7000000 )]
merge_dt=merge_dt[merge_dt.CustomerID != 'C15226']
merge_dt.loc[(merge_dt["Salary"] > 200000 )]
merge_dt=merge_dt[merge_dt.CustomerID !='C18942']
merge_dt=merge_dt[merge_dt.CustomerID !='C19970']
merge_dt=merge_dt[merge_dt.CustomerID !='C21710']
merge_dt.loc[(merge_dt["Appraisal_value"] > 8.097117e+06 )]## showing nothing means its already dropped
def remove_outlier(df_in, col_name):
    """Return the rows of *df_in* whose *col_name* value lies strictly inside
    the Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR).

    Values equal to a fence, and NaNs, are excluded (strict comparisons),
    matching the original behaviour.  *df_in* itself is not modified.
    """
    lower_q, upper_q = df_in[col_name].quantile([0.25, 0.75])
    spread = upper_q - lower_q  # interquartile range
    low_cut = lower_q - 1.5 * spread
    high_cut = upper_q + 1.5 * spread
    inside = (df_in[col_name] > low_cut) & (df_in[col_name] < high_cut)
    return df_in.loc[inside]
merge_dt=remove_outlier(df_in=merge_dt,col_name='GDP')
merge_dt=remove_outlier(df_in=merge_dt,col_name='Current_Instalment_Sequence')
merge_dt.shape
CLUSTER ANALYSIS
cluster_data=merge_dt
cluster_data=cluster_data.drop(axis=1,columns="CustomerID")
#cluster_data =merge_dt.set_index("CustomerID") #assign customer id as index to dataset to visualize the clusters
cluster_data.head(2)
cluster_data=cluster_data.drop(axis=1,columns='CreditRiskScore')
cluster_data=cluster_data.fillna(value=cluster_data.current_Appraisal_value.mean())
cluster_data['Appraisal_value']=cluster_data['Appraisal_value'].astype('int')
cluster_data['current_Appraisal_value']= cluster_data['current_Appraisal_value'].astype('int')
cluster_data['NoOfProperties']= cluster_data['NoOfProperties'].astype('object')
cat_cols = list(cluster_data.select_dtypes(include=['object']).columns)
num_cols = list(cluster_data.select_dtypes(include=['float64','float32','int32','int64']).columns)
cat_cols
Dummification
dummies_df=pd.get_dummies(cluster_data[cat_cols],drop_first=True)
numerical_data = cluster_data[num_cols]
numerical_data['index']=range(233335)
dummies_df['index']=range(233335)
final_data1=numerical_data.merge(dummies_df, left_on='index', right_on='index')
final_data1=final_data1.drop(axis=1,columns='index')
final_data1.to_csv('final_data.csv')
train_matrix1 = final_data1.as_matrix()
Sum_of_squared_distances = []
K = range(1,10)
for k in K:
km = KMeans(n_clusters=k)
km = km.fit(train_matrix1)
Sum_of_squared_distances.append(km.inertia_)
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
create K-means cluster with k - 4
kmeans1 = KMeans(n_clusters = 4)
kmeans1.fit(train_matrix1)
clusters=kmeans1.labels_
cluster_class=clusters.tolist()
len(cluster_class)
final_data1.shape
cluster_dt=merge_dt
cluster_dt['Cluster_class']=cluster_class
cluster_dt.Cluster_class.value_counts()
Observation - cluster 2 is smaller than the other clusters.
create credit class
# Bucket CreditRiskScore into three ordinal classes (same scheme as the
# earlier new_data bucketing).
# Fix: a score of exactly 650 previously fell through to "low_score"
# (it satisfied neither > 650 nor < 650); it is now Medium_score.
credit_rate = []
for i in range(len(cluster_dt.CreditRiskScore)):
    score = cluster_dt.CreditRiskScore.iloc[i]
    if score > 650:
        credit_rate.append("high_score")
    elif score > 450:  # (450, 650] — the > 650 case was handled above
        credit_rate.append("Medium_score")
    else:
        credit_rate.append("low_score")
cluster_dt['credit_class'] = credit_rate
Create a separate dataframe for each cluster
cust_seg_1=cluster_dt[cluster_dt.Cluster_class == 0]
cust_seg_2=cluster_dt[cluster_dt.Cluster_class == 1]
cust_seg_3=cluster_dt[cluster_dt.Cluster_class == 2]
cust_seg_4=cluster_dt[cluster_dt.Cluster_class == 3]
################### 1 #########################################
cust_high_1=cust_seg_1[cust_seg_1.credit_class == 'high_score']
cust_medium_1=cust_seg_1[cust_seg_1.credit_class == 'Medium_score']
cust_low_1=cust_seg_1[cust_seg_1.credit_class == 'low_score']
################### 2 ###########################################
cust_high_2=cust_seg_2[cust_seg_2.credit_class == 'high_score']
cust_medium_2=cust_seg_2[cust_seg_2.credit_class == 'Medium_score']
cust_low_2=cust_seg_2[cust_seg_2.credit_class == 'low_score']
################### 3 ###########################################
cust_high_3=cust_seg_3[cust_seg_3.credit_class == 'high_score']
cust_medium_3=cust_seg_3[cust_seg_3.credit_class == 'Medium_score']
cust_low_3=cust_seg_3[cust_seg_3.credit_class == 'low_score']
################### 4 ###########################################
cust_high_4=cust_seg_4[cust_seg_4.credit_class == 'high_score']
cust_medium_4=cust_seg_4[cust_seg_4.credit_class == 'Medium_score']
cust_low_4=cust_seg_4[cust_seg_4.credit_class == 'low_score']
cluster_dt.credit_class.value_counts()
score_high = cust_high_1['CurrentInterestrate']
score_medium = cust_medium_1['CurrentInterestrate']
score_low = cust_low_1['CurrentInterestrate']
legend = ['High', 'Medium','Low']
plt.hist([score_high, score_medium ,score_low], color=['orange', 'green','blue'])
plt.xlabel("CurrentInterestrate")
plt.ylabel("Frequency")
plt.legend(legend)
#plt.xticks(range(0, 7))
#plt.yticks(range(1, 20))
plt.title('Histogram of CurrentInterestrate for clust 1')
plt.show()
score_high = cust_high_2['CurrentInterestrate']
score_medium = cust_medium_2['CurrentInterestrate']
score_low = cust_low_2['CurrentInterestrate']
legend = ['High', 'Medium','Low']
plt.hist([score_high, score_medium ,score_low], color=['orange', 'green','blue'])
plt.xlabel("CurrentInterestrate")
plt.ylabel("Frequency")
plt.legend(legend)
#plt.xticks(range(0, 7))
#plt.yticks(range(1, 20))
plt.title('Histogram of CurrentInterestrate for clust 2')
plt.show()
score_high = cust_high_3['CurrentInterestrate']
score_medium = cust_medium_3['CurrentInterestrate']
score_low = cust_low_3['CurrentInterestrate']
legend = ['High', 'Medium','Low']
plt.hist([score_high, score_medium ,score_low], color=['orange', 'green','blue'])
plt.xlabel("CurrentInterestrate")
plt.ylabel("Frequency")
plt.legend(legend)
#plt.xticks(range(0, 7))
#plt.yticks(range(1, 20))
plt.title('Histogram of CurrentInterestrate for clust 3')
plt.show()
score_high = cust_high_4['CurrentInterestrate']
score_medium = cust_medium_4['CurrentInterestrate']
score_low = cust_low_4['CurrentInterestrate']
legend = ['High', 'Medium','Low']
plt.hist([score_high, score_medium ,score_low], color=['orange', 'green','blue'])
plt.xlabel("CurrentInterestrate")
plt.ylabel("Frequency")
plt.legend(legend)
#plt.xticks(range(0, 7))
#plt.yticks(range(1, 20))
plt.title('Histogram of CurrentInterestrate for clust 3')
plt.show()
cust_seg_1[['CurrentInterestrate','CreditRiskScore']].describe()
cust_seg_2[['CurrentInterestrate','CreditRiskScore']].describe()
cust_seg_3[['CurrentInterestrate','CreditRiskScore']].describe()
cust_seg_4[['CurrentInterestrate','CreditRiskScore']].describe()
K-means clustering on PCA Data
pca = PCA()
pca.fit(train_matrix2)
pca_samples = pca.transform(scaled_matrix)
fig, ax = plt.subplots(figsize=(14, 5))
sns.set(font_scale=1)
plt.step(range(matrix.shape[1]), pca.explained_variance_ratio_.cumsum(), where='mid',
label='cumulative explained variance')
sns.barplot(np.arange(1,matrix.shape[1]+1), pca.explained_variance_ratio_, alpha=0.5, color = 'g',
label='individual explained variance')
plt.xlim(0, 31)
ax.set_xticklabels([s if int(s.get_text())%2 == 0 else '' for s in ax.get_xticklabels()])
plt.ylabel('Explained variance', fontsize = 14)
plt.xlabel('Principal components', fontsize = 14)
plt.legend(loc='best', fontsize = 13);
X_normalized = StandardScaler().fit(X_cancer).transform(X_cancer)
pca = PCA(n_components = 2).fit(X_normalized)
pca_df = pca.transform(X_normalized)
Sum_of_squared_distances = []
K = range(1,10)
for k in K:
km = KMeans(n_clusters=k,init='k-means++',n_init=100)
km = km.fit(pca_df)
Sum_of_squared_distances.append(km.inertia_)
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
%matplotlib notebook
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from adspy_shared_utilities import plot_labelled_scatter
kmeans2 = KMeans(n_clusters = 3, random_state = 0)
kmeans2.fit(pca_df)
plot_labelled_scatter(pca_df, kmeans2.labels_,
['Cluster 1', 'Cluster 2', 'Cluster 3'])
BUILDING ML MODEL
7) Stacking-(Decision-Tree,SVM,Xg boost,RF,Light GBM,extra trees)
8) AdaBoost-AdaBoost)
11) XGBoost-XGBoost on PCA)
merge_dt.columns
Data Needs Grouping
merge_dt_1=merge_dt[['CustomerID','Current_Instalment_Sequence', 'Current_Outstanding','Current_Loan_to_Appraisedvalu_Percent','CurrentInterestrate',
'RealEstate_Current_Inflation','GDP', 'UnemploymentRate','current_Appraisal_value','remaining_outstanding']]
merge_dt_1.shape
Data That Does Not Need Grouping
merge_dt_2=merge_dt[['CustomerID', 'Starting_Instalment','Maturity_Period','Asset_type',
'Urban_Development', 'Villa_House', 'Investment_SelfOccupied',
'Starting_outstanding', 'Starting_Loan_to_Appraisedvalu_Percent',
'StartingInterestrate', 'RealEstate_Starting_Inflation',
'age', 'Salary', 'ProfessionalLicensure',
'UtilitySpending', 'eCommerceAccount', 'SocialMediaAccount','Appraisal_value',
'NoOfProperties', 'CreditRiskScore']]
merge_dt_2=merge_dt_2.drop_duplicates()# drop duplicates from dataset
merge_dt_2.shape
Function To Group The Data
First Approach
Second Approach
def feat_eng(data):
    """Aggregate every non-ID column of *data* per CustomerID.

    For each column other than 'CustomerID' a battery of summary statistics
    is computed over each customer's rows: location (mean/median), extremes
    (max/min/range/maxtoMin), dispersion (std/skew/kurtosis),
    successive-difference stats (meanAD/mad) and absolute-value extremes.
    Returns a DataFrame indexed by CustomerID with one '<col>_<stat>'
    feature per statistic.

    Fix: the original re-built ``data.groupby(['CustomerID'])[col]`` for
    every single statistic (~12 times per column); it is now hoisted once
    per column, which preserves the results exactly.
    """
    df = pd.DataFrame()
    for col in data.columns:
        if col in ['CustomerID']:
            continue
        grp = data.groupby(['CustomerID'])[col]  # hoisted: one groupby per column
        df[col + '_mean'] = grp.mean()
        df[col + '_median'] = grp.median()
        df[col + '_max'] = grp.max()
        df[col + '_min'] = grp.min()
        df[col + '_std'] = grp.std()
        df[col + '_skew'] = grp.skew()
        df[col + '_range'] = df[col + '_max'] - df[col + '_min']
        # SeriesGroupBy has no .kurtosis() aggregator, hence the apply.
        df[col + '_kurtosis'] = grp.apply(lambda x: x.kurtosis())
        df[col + '_maxtoMin'] = df[col + '_max'] / df[col + '_min']
        # Mean/median absolute successive difference — volatility of the
        # per-customer sequence (order-dependent, unlike std).
        df[col + '_meanAD'] = grp.apply(lambda x: np.mean(np.abs(np.diff(x))))
        df[col + '_mad'] = grp.apply(lambda x: np.median(np.abs(np.diff(x))))
        df[col + '_abs_max'] = grp.apply(lambda x: np.max(np.abs(x)))
        df[col + '_abs_min'] = grp.apply(lambda x: np.min(np.abs(x)))
        df[col + '_abs_avg'] = (df[col + '_abs_min'] + df[col + '_abs_max']) / 2
    return df
final_dt_1=feat_eng(data=merge_dt_1)
final_dt_1.head(10)
final_dt_1['CustomerID']=final_dt_1.index
final_dt_1.head(10)
final_dt_1.shape
final_dt_1.describe(include='all')
Join Two Column Merge dt 2 And final dt 1
train_data_1=final_dt_1.merge(merge_dt_2, left_on="CustomerID", right_on='CustomerID')
train_data_1.fillna(train_data_1['remaining_outstanding_maxtoMin'].median(),inplace=True)
train_data_1.isna().sum()
train_data_1[['eCommerceAccount','ProfessionalLicensure']]=train_data_1[['eCommerceAccount','ProfessionalLicensure']].astype('object')
#num1_cols = list(train_data_1.select_dtypes(include=['float64','float32','int32','int64']).columns)
#cat1_cols = list(train_data_1.select_dtypes(include=['object']).columns)
categorical_features =train_data_1.select_dtypes(include=['object']).columns
numerical_features =train_data_1.select_dtypes(include=['float64','float32','int32','int64']).columns
Feature Engg On Payment Status Column
new=merge_dt[['CustomerID','Current_Instalment_Sequence', 'Current_Outstanding','Current_Loan_to_Appraisedvalu_Percent','CurrentInterestrate',
'RealEstate_Current_Inflation','GDP', 'UnemploymentRate','current_Appraisal_value','remaining_outstanding','Payment_Status']]
new.Payment_Status.unique()
payment={'Non-Payoff/Non-Default': 2,'Payoff':4,'Default':0}
new['Payment_Status']=new['Payment_Status'].map(payment)
new['payment_total_score'] =[4]*new.shape[0]
payment_data=pd.DataFrame({'CustomerID':new.CustomerID,'Payment_Status':new.Payment_Status,'payment_total_score':new.payment_total_score})
def payment_eng1(data):
    """Per-customer count and sum of every non-ID column of *data*.

    Returns a DataFrame indexed by CustomerID with '<col>_count' and
    '<col>_sum' features.
    """
    out = pd.DataFrame()
    non_id_cols = [c for c in data.columns if c != 'CustomerID']
    for col in non_id_cols:
        per_customer = data.groupby(['CustomerID'])[col]
        out[col + '_count'] = per_customer.count()
        out[col + '_sum'] = per_customer.sum()
    return out
payment_data=payment_eng1(data=payment_data)
payment_data.head(5)
from scipy.stats import zscore
payment_percentile= payment_data.Payment_Status_sum/payment_data.payment_total_score_sum
payment_data['payment_z_score']=zscore(payment_percentile)
payment_data['payment_lenght']=payment_data.Payment_Status_count
print('size of train data',train_data_1.shape)
print('size of payment col',len(payment_percentile))
payment_data2=payment_data[['payment_z_score','payment_lenght']]
train_data_1=train_data_1.merge(payment_data2, left_on="CustomerID", right_on='CustomerID')
train_data_1.head(5)
CreditRiskScore
sns.distplot(train_data_1['CreditRiskScore'] , fit=norm);
# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train_data_1['CreditRiskScore'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
#Now plot the distribution
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
loc='best')
plt.ylabel('Frequency')
plt.title('credit score distribution')
#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train_data_1['CreditRiskScore'], plot=plt)
plt.show();
sns.distplot(np.log1p(train_data_1['CreditRiskScore']) , fit=norm);
# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(np.log1p(train_data_1['CreditRiskScore']))
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
#Now plot the distribution
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
loc='best')
plt.ylabel('Frequency')
plt.title('log(credit score+1) distribution')
#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(np.log1p(train_data_1['CreditRiskScore']), plot=plt)
plt.show();
Correlation Plot
a=train_data_1[numerical_features]
corr_with_credit_score = a.corr()['CreditRiskScore'].sort_values(ascending=False)
plt.figure(figsize=(25,10))
corr_with_credit_score.drop('CreditRiskScore').plot.bar()
plt.show();
train_data_1['CreditRiskScore'] = np.log1p(train_data_1['CreditRiskScore'])
#log transform skewed numeric features:
numeric_feats = train_data_1.dtypes[train_data_1.dtypes != "object"].index
skewed_feats = train_data_1[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
train_data_1[skewed_feats] = np.log1p(train_data_1[skewed_feats])
train_data_1[numeric_feats] = train_data_1[numeric_feats].apply(lambda x:pd.to_numeric(x)) #
train_data_1.drop(axis=1,columns='CustomerID',inplace=True)
df=train_data_1
df.head(5)
df = pd.get_dummies(df,drop_first=True)
df = df.fillna(df.mean())
df.isna().sum()
import pandas as pd
def clean_dataset(df):
    """Drop NaN/inf rows from *df* and return the remainder as float64.

    Note: ``dropna(inplace=True)`` mutates the caller's frame — preserved
    from the original.  Raises AssertionError if *df* is not a DataFrame.

    Fix: ``.any(1)`` passed the axis positionally, which is removed in
    pandas >= 2.0; the axis is now passed by keyword.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    # Keep only rows in which every value is finite.
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep].astype(np.float64)
df1=clean_dataset(df)
df1.shape
Split Data
#X, y = df.drop(['CreditRiskScore'], axis = 1), df['CreditRiskScore']
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
y= df1['CreditRiskScore']
X=df1.drop(['CreditRiskScore'], axis = 1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
BaseLine Model
import statsmodels.api as sm
import statsmodels.formula.api as smf
y=y_train
X=X_train
model = smf.OLS(y,X).fit()
predictions = model.predict(X)
print(model.summary())
test_prediction = model.predict(X_test)
print('Train r2 score: ', r2_score(predictions, y_train))
print('Test r2 score: ', r2_score(y_test, test_prediction))
train_mse1 = mean_squared_error(predictions, y_train)
test_mse1 = mean_squared_error(y_test, test_prediction)
train_rmse1 = np.sqrt(train_mse1)
test_rmse1 = np.sqrt(test_mse1)
print('Train RMSE: %.4f' % train_rmse1)
print('Test RMSE: %.4f' % test_rmse1)
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score
from itertools import product
def rmse_cv(model):
    """5-fold cross-validated RMSE of *model* on the global X_train/y_train.

    Fix: sklearn exposes MSE as the *negated* scorer
    "neg_mean_squared_error" (scorers are maximised); the old
    "mean_squared_error" name raises ValueError on sklearn >= 0.20.
    The existing negation before the sqrt already assumed negative
    scores, so the results are unchanged on versions where the old
    name still worked.
    """
    rmse = np.sqrt(-cross_val_score(model, X_train, y_train,
                                    scoring="neg_mean_squared_error", cv=5))
    return rmse
alphas = [0.0005, 0.001, 0.01, 0.03, 0.05, 0.1]
l1_ratios = [1.5, 1.1, 1, 0.9, 0.8, 0.7, 0.5]
cv_elastic = [rmse_cv(ElasticNet(alpha = alpha, l1_ratio=l1_ratio)).mean()
for (alpha, l1_ratio) in product(alphas, l1_ratios)]
# Fit the best-found ElasticNet configuration and inspect residuals/coefficients.
elastic = ElasticNet(alpha=0.0005, l1_ratio=0.9)
elastic.fit(X_train, y_train)
# let's look at the residuals as well:
# FIX: bare `matplotlib` was never imported (only pyplot as plt); use plt.rcParams.
plt.rcParams['figure.figsize'] = (6.0, 6.0)
preds = pd.DataFrame({"preds": elastic.predict(X_train), "true": y_train})
preds["residuals"] = preds["true"] - preds["preds"]
preds.plot(x="preds", y="residuals", kind="scatter")
rmse = np.sqrt(np.mean((preds['true'] - preds['preds'])**2))
print('RMSE: {0:.4f}'.format(rmse))
from sklearn.metrics import r2_score
print('R^2 train: %.3f' % r2_score(preds['true'], preds['preds']))
# Largest negative and positive coefficients (L1 drives many to exactly 0).
coef = pd.Series(elastic.coef_, index=X_train.columns)
imp_coef = pd.concat([coef.sort_values().head(50),
                      coef.sort_values().tail(10)])
feature_importance = pd.Series(index=X_train.columns, data=np.abs(elastic.coef_))
n_selected_features = (feature_importance > 0).sum()
print('{0:d} features, reduction of {1:2.2f}%'.format(
    n_selected_features, (1 - n_selected_features / len(feature_importance)) * 100))
feature_importance.sort_values().tail(30).plot(kind='bar', figsize=(18, 6))
Since we want the lowest RMSE, let's move on to boosting models
# XGBoost baseline with default hyper-parameters.
xgb_model1 = XGBRegressor()
xgb_model1.fit(X_train, y_train, verbose=False)
y_train_pred1 = xgb_model1.predict(X_train)
y_pred1 = xgb_model1.predict(X_test)
# FIX: r2_score takes (y_true, y_pred); the original swapped them for the train score.
print('Train r2 score: ', r2_score(y_train, y_train_pred1))
print('Test r2 score: ', r2_score(y_test, y_pred1))
train_mse1 = mean_squared_error(y_train, y_train_pred1)
test_mse1 = mean_squared_error(y_test, y_pred1)
train_rmse1 = np.sqrt(train_mse1)
test_rmse1 = np.sqrt(test_mse1)
print('Train RMSE: %.4f' % train_rmse1)
print('Test RMSE: %.4f' % test_rmse1)
# More trees, with early stopping on the held-out test fold.
xgb_model2 = XGBRegressor(n_estimators=1000)
xgb_model2.fit(X_train, y_train, early_stopping_rounds=5,
               eval_set=[(X_test, y_test)], verbose=False)
y_train_pred2 = xgb_model2.predict(X_train)
y_pred2 = xgb_model2.predict(X_test)
# FIX: r2_score takes (y_true, y_pred); the original swapped them for the train score.
print('Train r2 score: ', r2_score(y_train, y_train_pred2))
print('Test r2 score: ', r2_score(y_test, y_pred2))
train_mse2 = mean_squared_error(y_train, y_train_pred2)
test_mse2 = mean_squared_error(y_test, y_pred2)
train_rmse2 = np.sqrt(train_mse2)
test_rmse2 = np.sqrt(test_mse2)
print('Train RMSE: %.4f' % train_rmse2)
print('Test RMSE: %.4f' % test_rmse2)
# Slower learning rate with early stopping.
xgb_model3 = XGBRegressor(n_estimators=1000, learning_rate=0.05)
xgb_model3.fit(X_train, y_train, early_stopping_rounds=5,
               eval_set=[(X_test, y_test)], verbose=False)
y_train_pred3 = xgb_model3.predict(X_train)
y_pred3 = xgb_model3.predict(X_test)
# FIX: r2_score takes (y_true, y_pred); the original swapped them for the train score.
print('Train r2 score: ', r2_score(y_train, y_train_pred3))
print('Test r2 score: ', r2_score(y_test, y_pred3))
train_mse3 = mean_squared_error(y_train, y_train_pred3)
test_mse3 = mean_squared_error(y_test, y_pred3)
train_rmse3 = np.sqrt(train_mse3)
test_rmse3 = np.sqrt(test_mse3)
print('Train RMSE: %.4f' % train_rmse3)
print('Test RMSE: %.4f' % test_rmse3)
# Hand-tuned configuration: deeper trees, subsampled rows.
xgb_model4 = XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                          colsample_bytree=1, max_depth=7, n_jobs=-1)
xgb_model4.fit(X_train, y_train)
y_train_pred4 = xgb_model4.predict(X_train)
y_pred4 = xgb_model4.predict(X_test)
# FIX: r2_score takes (y_true, y_pred); the original swapped them for the train score.
print('Train r2 score: ', r2_score(y_train, y_train_pred4))
print('Test r2 score: ', r2_score(y_test, y_pred4))
train_mse4 = mean_squared_error(y_train, y_train_pred4)
test_mse4 = mean_squared_error(y_test, y_pred4)
train_rmse4 = np.sqrt(train_mse4)
test_rmse4 = np.sqrt(test_mse4)
print('Train RMSE: %.4f' % train_rmse4)
print('Test RMSE: %.4f' % test_rmse4)
ON LEADERBOARD SCORE- 117.35
# Per-feature importances from the best model so far.
feature_important = xgb_model4.feature_importances_
keys = list( X_train.columns)
values = list(feature_important)
data = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by = "score", ascending=True)
data.plot(kind='barh',figsize=(30,55),fontsize=25)
from collections import OrderedDict
# Rank features by xgboost's internal F-score (number of times used in a split).
OrderedDict(sorted(xgb_model4.get_booster().get_fscore().items(), key=lambda t: t[1], reverse=True))
# Keep only features used in at least 4 splits, then re-split on that subset.
most_relevant_features=list(dict((k, v) for k, v in xgb_model4.get_booster().get_fscore().items() if v >= 4).keys())
train_x=df[most_relevant_features]
train_y=df['CreditRiskScore']
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size = 0.2, random_state = 106)
# Refit the model-4 configuration on the reduced feature set, with early stopping.
xgb_model5 = XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                          colsample_bytree=1, max_depth=7, n_jobs=-1)
xgb_model5.fit(X_train, y_train, early_stopping_rounds=5,
               eval_set=[(X_test, y_test)], verbose=False)
y_train_pred5 = xgb_model5.predict(X_train)
y_pred5 = xgb_model5.predict(X_test)
# FIX: r2_score takes (y_true, y_pred); the original swapped them for the train score.
print('Train r2 score: ', r2_score(y_train, y_train_pred5))
print('Test r2 score: ', r2_score(y_test, y_pred5))
train_mse5 = mean_squared_error(y_train, y_train_pred5)
test_mse5 = mean_squared_error(y_test, y_pred5)
train_rmse5 = np.sqrt(train_mse5)
test_rmse5 = np.sqrt(test_mse5)
print('Train RMSE: %.4f' % train_rmse5)
print('Test RMSE: %.4f' % test_rmse5)
# FIX: RandomizedSearchCV was used without being imported anywhere in the file.
from sklearn.model_selection import RandomizedSearchCV

xgb1 = XGBRegressor()
parameters = {'nthread': [4],  # when using hyperthreading, xgboost may become slower
              # FIX: 'reg:linear' is the deprecated alias of 'reg:squarederror'.
              'objective': ['reg:squarederror'],
              'learning_rate': [0.03, 0.05, 0.08, 0.1, 0.3],  # so called `eta` value
              'max_depth': [5, 6, 7],
              'min_child_weight': [4, 6],
              'silent': [1],
              'subsample': [0.3, 0.5, 0.7, 0.9],
              'colsample_bytree': [0.5, 0.7, 1],
              'n_estimators': [500]}
# Randomized search over the grid with 5-fold CV.
xgb_grid = RandomizedSearchCV(xgb1,
                              parameters,
                              cv=5,
                              n_jobs=5,
                              verbose=True)
xgb_grid.fit(X_train, y_train, early_stopping_rounds=5,
             eval_set=[(X_test, y_test)])
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)
# Re-split on the relevant-feature subset and fit the tuned configuration.
train_x = df[most_relevant_features]
train_y = df['CreditRiskScore']
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.2, random_state=0)
xgb_model6 = XGBRegressor(subsample=0.9, silent=1,
                          nthread=4, n_estimators=500, min_child_weight=4,
                          max_depth=6, learning_rate=0.05, colsample_bytree=1)
xgb_model6.fit(X_train, y_train, early_stopping_rounds=5,
               eval_set=[(X_test, y_test)], verbose=False)
y_train_pred6 = xgb_model6.predict(X_train)
y_pred6 = xgb_model6.predict(X_test)
# FIX: the original computed metrics from the *previous* model's stale
# predictions (y_train_pred5 / y_pred5) and also swapped r2_score's arguments.
# Use model 6's own predictions in (y_true, y_pred) order.
print('Train r2 score: ', r2_score(y_train, y_train_pred6))
print('Test r2 score: ', r2_score(y_test, y_pred6))
train_mse6 = mean_squared_error(y_train, y_train_pred6)
test_mse6 = mean_squared_error(y_test, y_pred6)
train_rmse6 = np.sqrt(train_mse6)
test_rmse6 = np.sqrt(test_mse6)
print('Train RMSE: %.4f' % train_rmse6)
print('Test RMSE: %.4f' % test_rmse6)
ON LEADERBOARD SCORE- 113.55
#X, y = df.drop(['CreditRiskScore'], axis = 1), df['CreditRiskScore']
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Fresh split for the distance-based KNN model below.
y= df1['CreditRiskScore']
X=df1.drop(['CreditRiskScore'], axis = 1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
from sklearn.preprocessing import MinMaxScaler
# Scale features to [0, 1]; KNN distances are sensitive to feature magnitude.
sc = MinMaxScaler(feature_range=(0, 1))
#sc = StandardScaler()
x_train = sc.fit_transform(X_train)
# Only transform (not fit) the test split to avoid leakage.
x_test = sc.transform(X_test)
#import required packages
from sklearn import neighbors
from sklearn.metrics import mean_squared_error
from math import sqrt
import matplotlib.pyplot as plt
%matplotlib inline
# Elbow search over k for KNN regression.
rmse_val = []  # rmse for each candidate k
for K in range(1, 21):  # idiomatic replacement for range(20); K = K+1
    model = neighbors.KNeighborsRegressor(n_neighbors=K)
    model.fit(x_train, y_train)          # fit the model
    pred = model.predict(x_test)         # predict on the test set
    error = sqrt(mean_squared_error(y_test, pred))  # rmse
    rmse_val.append(error)
    print('RMSE value for k= ', K, 'is:', error)
# plotting the rmse values against k values
curve = pd.DataFrame(rmse_val)  # elbow curve
curve.plot()
model = neighbors.KNeighborsRegressor(n_neighbors=12)  # best k from the curve
model.fit(x_train, y_train)
pred = model.predict(x_test)
knn_train = model.predict(x_train)
knn_test = model.predict(x_test)
# FIX: r2_score takes (y_true, y_pred); the original swapped them for the train score.
print('Train r2 score: ', r2_score(y_train, knn_train))
print('Test r2 score: ', r2_score(y_test, knn_test))
train_mse5 = mean_squared_error(y_train, knn_train)
test_mse5 = mean_squared_error(y_test, knn_test)
train_rmse5 = np.sqrt(train_mse5)
test_rmse5 = np.sqrt(test_mse5)
print('Train RMSE: %.4f' % train_rmse5)
print('Test RMSE: %.4f' % test_rmse5)
ON LEADERBOARD SCORE- 119.63
# Creating a training set for modeling and validation set to check model performance
train_cat_dt = train_data_1
X = train_cat_dt.drop(['CreditRiskScore'], axis=1)
y = train_cat_dt.CreditRiskScore
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.7, random_state=1234)
# FIX: np.object was removed in NumPy 1.24; compare against the builtin `object`.
categorical_features_indices = np.where(X_train.dtypes == object)[0]
# importing library and building model
from catboost import CatBoostRegressor
model = CatBoostRegressor(iterations=1000, depth=3, learning_rate=0.1, loss_function='RMSE', random_seed=42)
model.fit(X_train, y_train, cat_features=categorical_features_indices, eval_set=(X_validation, y_validation), plot=True)
#X, y = df.drop(['CreditRiskScore'], axis = 1), df['CreditRiskScore']
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Fresh split + [0,1] scaling for the stacking ensemble below.
y= df1['CreditRiskScore']
X=df1.drop(['CreditRiskScore'], axis = 1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range=(0, 1))
#sc = StandardScaler()
x_train = sc.fit_transform(X_train)
# Only transform (not fit) the test split to avoid leakage.
x_test = sc.transform(X_test)
import numpy as np
import pandas as pd
# data precession
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
# model
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
class Ensemble(object):
    """Out-of-fold stacking: base models' OOF predictions train a level-1 stacker."""

    def __init__(self, n_splits, stacker, base_models):
        # n_splits: number of KFold splits used for out-of-fold prediction
        # stacker: level-1 meta-model fitted on the base models' OOF predictions
        # base_models: iterable of level-0 regressors (refit per fold)
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        """Fit the base models out-of-fold on (X, y); return the stacker's predictions for T."""
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)
        folds = list(KFold(n_splits=self.n_splits, shuffle=True, random_state=2016).split(X, y))
        # S_train[r, i]: OOF prediction of base model i for training row r.
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        # S_test[r, i]: fold-averaged prediction of base model i for test row r.
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):
            S_test_i = np.zeros((T.shape[0], self.n_splits))
            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                y_holdout = y[test_idx]
                print ("Fit Model %d fold %d" % (i, j))
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_holdout)[:]
                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict(T)[:]
            # Average the per-fold test predictions for this base model.
            S_test[:, i] = S_test_i.mean(axis=1)
        # results = cross_val_score(self.stacker, S_train, y, cv=5, scoring='r2')
        # print("Stacker score: %.4f (%.4f)" % (results.mean(), results.std()))
        # exit()
        self.stacker.fit(S_train, y)
        res = self.stacker.predict(S_test)[:]
        return res
# Hyper-parameters for each base learner in the stacking ensemble.
rf_params = {
    'n_estimators': 1000,
    'max_depth': 8,
    'min_samples_split': 100,
    'min_samples_leaf': 30,
}
xgb_params = {
    'n_estimators': 500,
    'min_child_weight': 12,
    'learning_rate': 0.12,
    'max_depth': 6,
    'subsample': 0.77,
    'reg_lambda': 0.8,
    'reg_alpha': 0.4,
    'base_score': 0,
    # 'seed': 400,
    'silent': 1,
}
lgb_params = {
    'n_estimators': 450,
    'max_bin': 8,
    'learning_rate': 0.037,    # shrinkage_rate
    'metric': 'l1',            # or 'mae'
    'sub_feature': 0.35,
    'bagging_fraction': 0.85,  # sub_row
    'bagging_freq': 40,
    'num_leaves': 512,         # num_leaf
    'min_data': 500,           # min_data_in_leaf
    'min_hessian': 0.05,       # min_sum_hessian_in_leaf
    'verbose': 0,
    'feature_fraction_seed': 2,
    'bagging_seed': 3,
}
# Level-0 base learners.
xgb_model = XGBRegressor(**xgb_params)
lgb_model = LGBMRegressor(**lgb_params)
rf_model = RandomForestRegressor(**rf_params)
et_model = ExtraTreesRegressor()
# SVM is too slow above ~10000 samples, so SVR is left out:
# svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.05)
dt_model = DecisionTreeRegressor()
ada_model = AdaBoostRegressor(learning_rate=0.1, loss='square', n_estimators=1000)
# Level-1 stacker: plain linear regression over the out-of-fold predictions.
stack = Ensemble(n_splits=5,
                 stacker=LinearRegression(),
                 base_models=(rf_model, lgb_model, ada_model, et_model, dt_model, xgb_model))
# Run the OOF stacking pipeline and score it on the held-out test split.
y_pred_stack = stack.fit_predict(x_train, y_train, x_test)
#print('Train r2 score: ', r2_score(y_train, xgb_pred_train))
# FIX: r2_score takes (y_true, y_pred); the original swapped them.
print('Test r2 score: ', r2_score(y_test, y_pred_stack))
#train_mse5 = mean_squared_error(y_train, xgb_pred_train)
test_mse5 = mean_squared_error(y_test, y_pred_stack)
#train_rmse5 = np.sqrt(train_mse5)
test_rmse5 = np.sqrt(test_mse5)
#print('Train RMSE: %.4f' % train_rmse5)
print('Test RMSE: %.4f' % test_rmse5)
ON LEADERBOARD SCORE- 116.6
from sklearn.ensemble import AdaBoostRegressor
from sklearn.datasets import make_regression
# FIX: the original bound this synthetic demo data to X_train/y_train,
# clobbering the real training split that the later PCA/XGB cells rely on.
# Use distinct names for the toy regression problem.
X_demo, y_demo = make_regression(n_features=4, n_informative=2,
                                 random_state=0, shuffle=False)
regr = AdaBoostRegressor(random_state=0, n_estimators=100, learning_rate=0.001)
regr.fit(X_demo, y_demo)
from sklearn.ensemble import AdaBoostRegressor
ada2 = AdaBoostRegressor(n_estimators=500, learning_rate=0.001, random_state=1)
# Mean CV score (neg-MSE, so closer to 0 is better).
score = np.mean(cross_val_score(ada2, X_demo, y_demo, scoring='neg_mean_squared_error', cv=5, n_jobs=1))
score
from sklearn.decomposition import PCA
# FIX: StandardScaler was used below without being imported in this section.
from sklearn.preprocessing import StandardScaler
#scaled_matrix=X_train.as_matrix
# FIX: DataFrame.as_matrix() was removed in pandas 1.0; use to_numpy().
matrix = X_train.to_numpy()
scaler = StandardScaler()
scaler.fit(matrix)
scaled_matrix = scaler.transform(matrix)
pca = PCA()
pca.fit(scaled_matrix)
pca_samples = pca.transform(scaled_matrix)
# Scree plot: cumulative vs individual explained variance per component.
fig, ax = plt.subplots(figsize=(50, 20))
sns.set(font_scale=1)
plt.step(range(matrix.shape[1]), pca.explained_variance_ratio_.cumsum(), where='mid',
         label='cumulative explained variance')
# FIX: pass x/y as keywords — positional data arguments were removed in seaborn 0.12.
sns.barplot(x=np.arange(1, matrix.shape[1] + 1), y=pca.explained_variance_ratio_, alpha=0.5, color='g',
            label='individual explained variance')
plt.xlim(0, 50)
ax.set_xticklabels([s if int(s.get_text()) % 2 == 0 else '' for s in ax.get_xticklabels()])
plt.ylabel('Explained variance', fontsize=50)
plt.xlabel('Principal components', fontsize=50)
plt.legend(loc='best', fontsize=50);
#Principle component analysis
from sklearn.decomposition import PCA
# Keep only the first 50 principal components.
pca = PCA(n_components=50)
pca.fit(X_train)
pca_X_train=pca.transform(X_train)
pca_X_test=pca.transform(X_test)
# NOTE(review): X_train/X_test are overwritten with ndarrays here; any later
# cell expecting DataFrame attributes (e.g. .columns) would break past this point.
X_train= pca_X_train
X_test= pca_X_test
# Tuned XGBoost on the PCA-reduced features.
xgb_model7 = XGBRegressor(subsample=0.9, silent=1,
                          nthread=4, n_estimators=500, min_child_weight=4,
                          max_depth=6, learning_rate=0.05, colsample_bytree=1)
xgb_model7.fit(X_train, y_train, early_stopping_rounds=5,
               eval_set=[(X_test, y_test)], verbose=False)
y_train_pred7 = xgb_model7.predict(X_train)
y_pred7 = xgb_model7.predict(X_test)
# FIX: r2_score takes (y_true, y_pred); the original swapped them for the train score.
print('Train r2 score: ', r2_score(y_train, y_train_pred7))
print('Test r2 score: ', r2_score(y_test, y_pred7))
train_mse7 = mean_squared_error(y_train, y_train_pred7)
test_mse7 = mean_squared_error(y_test, y_pred7)
train_rmse7 = np.sqrt(train_mse7)
test_rmse7 = np.sqrt(test_mse7)
print('Train RMSE: %.4f' % train_rmse7)
print('Test RMSE: %.4f' % test_rmse7)
Test Data
# Quick structural inspection of the two test tables.
print("test_demographic dataset has {} samples with {} features each.".format(*test_demo_dt.shape))
print ("test_payment dataset has {} samples with {} features each.".format(*test_payment_dt.shape))
test_demo_dt.head()
test_demo_dt.columns
test_demo_dt.dtypes
test_payment_dt.head(30)
test_payment_dt.dtypes
# Parse DOB and derive an integer age in whole calendar years.
# FIX: astype('datetime64') is deprecated (use pd.to_datetime), and the
# '<m8[Y]' timedelta cast was removed in pandas 2.x — compute the age
# explicitly, subtracting one year if the birthday hasn't occurred yet.
test_demo_dt['DOB'] = pd.to_datetime(test_demo_dt['DOB'])
_now = pd.Timestamp('now')
test_demo_dt['age'] = test_demo_dt['DOB'].apply(
    lambda d: _now.year - d.year - ((_now.month, _now.day) < (d.month, d.day)))
test_demo_dt['age'] = test_demo_dt['age'].astype('int')
test_demo_dt = test_demo_dt.drop(axis=1, columns='DOB')
# Join the payment history with demographics on CustomerID.
test_merge_dt = test_payment_dt.merge(test_demo_dt, left_on='CustomerID', right_on='CustomerID')
test_merge_dt.columns
test_merge_dt.head()
test_merge_dt.shape
sample_file.shape
test_merge_dt.head(5)
test_merge_dt.info()
# Column-wise summary sorted by non-null count to surface missing data.
test_merge_dt.describe(include='all').T.sort_values("count")
print(test_merge_dt.ProfessionalLicensure.unique())
print(test_merge_dt.eCommerceAccount.unique())
print(test_merge_dt.NoOfProperties.unique())
# Low-cardinality numeric flags are treated as categoricals, matching the
# handling applied to the training data.
test_merge_dt['ProfessionalLicensure']=test_merge_dt['ProfessionalLicensure'].astype('object')
test_merge_dt['eCommerceAccount']=test_merge_dt['eCommerceAccount'].astype('object')
test_merge_dt['NoOfProperties']=test_merge_dt['NoOfProperties'].astype('object')
test_merge_dt.Urban_Development.unique()
test_merge_dt.describe(include=['O'])
test_merge_dt.describe(include=['float32','float64','int64','int32'])
#function to find missing values
def miss_data(x):
total = x.isnull().sum().sort_values(ascending=False)
percent = (x.isnull().sum()/x.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.index.name = 'column_names'
missing_data.reset_index(inplace=True)
sns.set(style="whitegrid")
sns.set(rc={'figure.figsize':(20,8.27)})
sns.barplot(missing_data.column_names,missing_data.Percent, alpha=0.9)
print(plt.title('missing data plot'))
print(plt.ylabel('percentage of missing data', fontsize=12))
print(plt.xlabel('column names', fontsize=12))
print(plt.show())
print(missing_data)
miss_data(x=test_merge_dt)
# Derived features: back out the property appraisal values from the
# loan-to-value percentages, and the principal repaid so far.
test_merge_dt['Appraisal_value'] = test_merge_dt['Starting_outstanding'] / (test_merge_dt['Starting_Loan_to_Appraisedvalu_Percent'] / 100)
test_merge_dt['current_Appraisal_value'] = test_merge_dt['Current_Outstanding'] / (test_merge_dt['Current_Loan_to_Appraisedvalu_Percent'] / 100)
test_merge_dt['remaining_outstanding'] = test_merge_dt['Starting_outstanding'] - test_merge_dt['Current_Outstanding']
test_merge_dt.columns
# FIX: DataFrame.as_matrix() was removed in pandas 1.0; use to_numpy().
test_matrix = test_merge_dt.to_numpy()
# NOTE(review): fit_predict REFITS kmeans2 (from the training section) on the
# test data rather than assigning test rows to the trained clusters — confirm
# this is intentional.
kmeans2.fit_predict(test_matrix)
clusters = kmeans2.labels_
test_cluster_class = clusters.tolist()
Data Needs Grouping
# Time-varying (per-installment) columns: these need per-customer aggregation.
test_merge_dt_1=test_merge_dt[['CustomerID','Current_Instalment_Sequence', 'Current_Outstanding','Current_Loan_to_Appraisedvalu_Percent','CurrentInterestrate',
'RealEstate_Current_Inflation','GDP', 'UnemploymentRate','current_Appraisal_value','remaining_outstanding']]
test_merge_dt_1.shape
Data Don't Need Grouping
# Static (per-customer) columns: constant across installments, so only
# deduplication is needed, not aggregation.
test_merge_dt_2=test_merge_dt[['CustomerID', 'Starting_Instalment','Maturity_Period','Asset_type',
'Urban_Development', 'Villa_House', 'Investment_SelfOccupied',
'Starting_outstanding', 'Starting_Loan_to_Appraisedvalu_Percent',
'StartingInterestrate', 'RealEstate_Starting_Inflation',
'age', 'Salary', 'ProfessionalLicensure',
'UtilitySpending', 'eCommerceAccount', 'SocialMediaAccount','Appraisal_value',
'NoOfProperties']]
test_merge_dt_2=test_merge_dt_2.drop_duplicates()# drop duplicates from dataset
test_merge_dt_2.shape
Function To Group The Data
def feat_eng(data):
    """Aggregate per-customer statistics for every column of *data* except CustomerID.

    For each column computes mean/median/max/min/std/skew/kurtosis plus range,
    max-to-min ratio, mean & median absolute successive difference, and
    absolute-value extremes. Returns a DataFrame indexed by CustomerID.
    """
    df = pd.DataFrame()
    grouped = data.groupby(['CustomerID'])
    for col in data.columns:
        if col in ['CustomerID']:
            continue
        grp = grouped[col]  # hoisted: the original regrouped the frame for every statistic
        df[col + '_mean'] = grp.mean()
        df[col + '_median'] = grp.median()
        df[col + '_max'] = grp.max()
        df[col + '_min'] = grp.min()
        df[col + '_std'] = grp.std()
        df[col + '_skew'] = grp.skew()
        df[col + '_range'] = df[col + '_max'] - df[col + '_min']
        df[col + '_kurtosis'] = grp.apply(lambda x: x.kurtosis())
        # NOTE(review): divides by the per-customer min, which can be 0 -> inf.
        df[col + '_maxtoMin'] = df[col + '_max'] / df[col + '_min']
        # Mean / median absolute successive difference (robust variability measures).
        df[col + '_meanAD'] = grp.apply(lambda x: np.mean(np.abs(np.diff(x))))
        df[col + '_mad'] = grp.apply(lambda x: np.median(np.abs(np.diff(x))))
        df[col + '_abs_max'] = grp.apply(lambda x: np.max(np.abs(x)))
        df[col + '_abs_min'] = grp.apply(lambda x: np.min(np.abs(x)))
        df[col + '_abs_avg'] = (df[col + '_abs_min'] + df[col + '_abs_max']) / 2
    return df
# Aggregate the time-varying test columns per customer.
test_final_dt_1=feat_eng(data=test_merge_dt_1)
test_final_dt_1.head(10)
# The groupby result is indexed by CustomerID; expose it as a column for merging.
test_final_dt_1['CustomerID']=test_final_dt_1.index
test_final_dt_1.head(10)
test_final_dt_1.shape
test_final_dt_1.describe(include='all')
Join The Two Tables test_final_dt_1 And test_merge_dt_2
# Merge the per-customer aggregates with the static (deduplicated) columns.
test_train_data_1=test_final_dt_1.merge(test_merge_dt_2, left_on="CustomerID", right_on='CustomerID')
# NOTE(review): this fills EVERY NaN in the frame with one column's median —
# presumably intended only for the maxtoMin ratio column; confirm.
test_train_data_1.fillna(test_train_data_1['remaining_outstanding_maxtoMin'].median(),inplace=True)
test_train_data_1.isna().sum()
test_train_data_1[['eCommerceAccount','ProfessionalLicensure']]=test_train_data_1[['eCommerceAccount','ProfessionalLicensure']].astype('object')
#num1_cols = list(train_data_1.select_dtypes(include=['float64','float32','int32','int64']).columns)
#cat1_cols = list(train_data_1.select_dtypes(include=['object']).columns)
categorical_features =test_train_data_1.select_dtypes(include=['object']).columns
numerical_features =test_train_data_1.select_dtypes(include=['float64','float32','int32','int64']).columns
Feature Engg On Payment Status Column
# Per-installment payment score: Payoff=4, on-schedule=2, Default=0.
# FIX: take an explicit .copy() — the original wrote into a slice view of
# test_merge_dt (SettingWithCopyWarning, possible mutation of the parent frame).
new = test_merge_dt[['CustomerID', 'Current_Instalment_Sequence', 'Current_Outstanding', 'Current_Loan_to_Appraisedvalu_Percent', 'CurrentInterestrate',
                     'RealEstate_Current_Inflation', 'GDP', 'UnemploymentRate', 'current_Appraisal_value', 'remaining_outstanding', 'Payment_Status']].copy()
new.Payment_Status.unique()
payment = {'Non-Payoff/Non-Default': 2, 'Payoff': 4, 'Default': 0}
new['Payment_Status'] = new['Payment_Status'].map(payment)
# Maximum achievable score per row; used later as the normalising denominator.
new['payment_total_score'] = [4] * new.shape[0]
payment_data = pd.DataFrame({'CustomerID': new.CustomerID, 'Payment_Status': new.Payment_Status, 'payment_total_score': new.payment_total_score})
def payment_eng1(data):
    """Per-customer count and sum of every column of *data* except CustomerID.

    Returns a DataFrame indexed by CustomerID with '<col>_count' and
    '<col>_sum' columns.
    """
    df = pd.DataFrame()
    grouped = data.groupby(['CustomerID'])
    for col in data.columns:
        if col in ['CustomerID']:
            continue
        grp = grouped[col]  # hoisted: avoid regrouping once per statistic
        df[col + '_count'] = grp.count()
        df[col + '_sum'] = grp.sum()
    return df
payment_data=payment_eng1(data=payment_data)
payment_data.head(5)
from scipy.stats import zscore
# Fraction of the maximum achievable payment score, standardised to a z-score.
payment_percentile= payment_data.Payment_Status_sum/payment_data.payment_total_score_sum
payment_data['payment_z_score']=zscore(payment_percentile)
# payment_lenght (sic — kept for consistency with the column used below):
# number of payment records per customer.
payment_data['payment_lenght']=payment_data.Payment_Status_count
# NOTE(review): prints the TRAIN table's shape while processing the test data —
# presumably a train/test size comparison; confirm intended.
print('size of train data',train_data_1.shape)
print('size of payment col',len(payment_percentile))
payment_data2=payment_data[['payment_z_score','payment_lenght']]
test_train_data_1=test_train_data_1.merge(payment_data2, left_on="CustomerID", right_on='CustomerID')
test_train_data_1.head(5)
#log transform skewed numeric features:
# FIX: `skew` was used without being imported (only zscore was imported above).
from scipy.stats import skew

numeric_feats = test_train_data_1.dtypes[test_train_data_1.dtypes != "object"].index
skewed_feats = test_train_data_1[numeric_feats].apply(lambda x: skew(x.dropna()))  # compute skewness
# NOTE(review): the skew threshold is re-computed on the TEST set here; for
# strict train/test consistency the feature list selected during training
# should be reused instead — confirm.
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
test_train_data_1[skewed_feats] = np.log1p(test_train_data_1[skewed_feats])
test_train_data_1[numeric_feats] = test_train_data_1[numeric_feats].apply(lambda x: pd.to_numeric(x))
# The ID is not a predictive feature.
test_train_data_1.drop(axis=1, columns='CustomerID', inplace=True)
test_df = test_train_data_1
test_df.head(5)
# One-hot encode categoricals and impute remaining NaNs with column means.
test_df = pd.get_dummies(test_df, drop_first=True)
test_df = test_df.fillna(test_df.mean())
test_df.shape
submission1= pd.read_csv('samplesubmission-1557545918238.csv')
# Restrict the test frame to the features the final model was trained on.
test_dtt=test_df[most_relevant_features]
submission1.head(5)
predict_lmTest_ss = xgb_model6.predict(test_dtt)
# NOTE(review): np.exp implies the target was log-transformed during training
# (log1p was applied to skewed features) — confirm whether expm1 is the
# correct inverse here.
predict_test=np.exp(predict_lmTest_ss)
submission1['CreditRiskScore'] = predict_test
submission1.head()
submission1.to_csv('xgb22.csv', index=False)
| Model | Validation | Test |
|---|---|---|
| XG Boost5 | 0.13 | 113.55 |
| MLR | 0.15 | |
| Elastic net | 0.16 | |
| XG Boost4 | 0.1355 | 117.35 |
| CAT Boost | 0.13 | 168.55 |
| KNN | 0.16 | 119.79 |
| Stacking | 0.1291 | 116.6 |
| XG boost(pca) | 0.1311 | |
XG Boost performs better than the other models on this dataset.